library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
baseball<-read.csv("baseball.csv")

#creating a new variable in our dataset for specified condition named *condition*: binary indicator variable indicating runner on 3rd with less than 2 outs

data <- baseball %>%
  mutate(condition = ifelse(outs_when_up < 2 & (!is.na(on_3b)), 1, 0))

#baseline filtering out of bunts/incorrectly recorded data
#assuming > 40mph is reasonable to assume, removing NA values

data <- data %>% 
  filter(bat_speed > 40) %>%
  filter(bat_speed != "NA")

#create new variable for average bat speed of each player
#create new variable for deviance of each swing compared to individual player's averages
#create new variable that records the standard deviation of each player's bat speed
#create new variable that indicates speed three standard deviations above and below a player's average bat speed

data <- data %>%
  group_by(player_name) %>%
  mutate(avg_bat_speed = mean(bat_speed, na.rm = TRUE),
         deviance = ifelse(!is.na(bat_speed), bat_speed - avg_bat_speed, NA),
  sd_bat_speed = sd(bat_speed, na.rm = TRUE),
  three_sd_above = avg_bat_speed + 3 * sd_bat_speed,
 three_sd_below = avg_bat_speed - 3 * sd_bat_speed
  )  %>%
  ungroup()

#filters out unreasonable outliers (laying outside the 3-standard deviation range) of each individual player's average bat speed.

data <- data %>% filter(bat_speed < three_sd_above) %>% filter(bat_speed > three_sd_below)

#modeling bat speed and swing length

library(lme4)
## Loading required package: Matrix
#utilizing (1 | player_name) as a random effect to account for player-to-player variability in bat_speed
#allows us to more robustly observe the effect of the condition on bat speed

bat_speed_model <- lmer(
  bat_speed ~ condition + (1 | player_name), 
  data = data 
)

#utilizing (1 | player_name) as a random effect to account for player-to-player variability in swing_length
#allows us to more robustly observe the effect of the condition on swing length

swing_length_model <- lmer(
  swing_length ~ condition + (1 | player_name), 
  data = data 
)

#read in results

summary(bat_speed_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bat_speed ~ condition + (1 | player_name)
##    Data: data
## 
## REML criterion at convergence: 1852713
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.3557 -0.5542  0.1048  0.6579  3.4541 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  player_name (Intercept)  7.496   2.738   
##  Residual                22.546   4.748   
## Number of obs: 310705, groups:  player_name, 647
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 70.31310    0.10923  643.74
## condition   -0.51477    0.03907  -13.17
## 
## Correlation of Fixed Effects:
##           (Intr)
## condition -0.018
summary(swing_length_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: swing_length ~ condition + (1 | player_name)
##    Data: data
## 
## REML criterion at convergence: 727320.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.8935 -0.6808  0.0304  0.7274  6.3042 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  player_name (Intercept) 0.1583   0.3979  
##  Residual                0.6029   0.7764  
## Number of obs: 310705, groups:  player_name, 647
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  7.244184   0.015925 454.882
## condition   -0.043433   0.006389  -6.798
## 
## Correlation of Fixed Effects:
##           (Intr)
## condition -0.020
#function to assist visualization in further steps

create_circle <- function(center_x, center_y, radius, n_points = 100) {
  theta <- seq(0, 2 * pi, length.out = n_points)
  data.frame(
    x = center_x + radius * cos(theta),
    y = center_y + radius * sin(theta)
  )
}

library(ggplot2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
#will help us view the highest density of high run expectancies in the data

filtered_data <- subset(data, delta_run_exp > 2.5)

#pinpoint location of the highest density of run expectancies over 2.5

density_estimate <- kde2d(
  x = filtered_data$deviance, 
  y = filtered_data$delta_run_exp, 
  n = 100
)

#steps to create circle to assist visualization aid 

max_density_idx <- which(density_estimate$z == max(density_estimate$z), arr.ind = TRUE)
center_x <- density_estimate$x[max_density_idx[1]]
center_y <- density_estimate$y[max_density_idx[2]]

circle_data <- create_circle(center_x, center_y, radius = 0.5)

#plots bat speed deviance against run expectancies, adding a circle to the location of the highest density of run expectancies of 2.5 -- allows us to visually determine strategy

ggplot(data, aes(x = deviance, y = delta_run_exp, color = inning)) +
  geom_point(alpha = 0.7) +  
  geom_polygon(data = circle_data, aes(x = x, y = y), inherit.aes = FALSE, 
               color = "red", fill = NA, size = 1) + 
  labs(
    title = "Relationship Between Bat Speed Deviance and Run Expectancy",
    x = "Deviance from Average Bat Speed",
    y = "Run Expectancy"
  ) + # Preserve aspect ratio
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

#view the bat speed plotted against location of the hit -- again, allows us to examine strategy possibilities

ggplot(data, aes(x = hc_x, y = hc_y, color = bat_speed)) +
  geom_point(alpha = 0.7, size = 3) +
  scale_color_viridis_c(option = "plasma", name = "Bat Speed") + 
  labs(
    title = "Hit Coordinates Colored by Bat Speed",
    x = "Hit Coordinate X (hc_x)",
    y = "Hit Coordinate Y (hc_y)"
  ) +
  theme_minimal()
## Warning: Removed 196266 rows containing missing values or values outside the scale range
## (`geom_point()`).

#same plots, specified for condition (runner on 3rd, less than 2 outs)

data3 <- data %>% filter(
  outs_when_up < 2 & (!is.na(on_3b))
)

filtered_data3 <- subset(data3, delta_run_exp > 2.5)

density_estimate3 <- kde2d(
  x = filtered_data3$deviance, 
  y = filtered_data3$delta_run_exp, 
  n = 100
)
max_density_idx3 <- which(density_estimate3$z == max(density_estimate3$z), arr.ind = TRUE)
center_x3 <- density_estimate3$x[max_density_idx3[1]]
center_y3 <- density_estimate3$y[max_density_idx3[2]]

circle_data3 <- create_circle(center_x3, center_y3, radius = 0.5)

ggplot(data3, aes(x = deviance, y = delta_run_exp, color = inning)) +
  geom_point(alpha = 0.7) +  
  geom_polygon(data = circle_data3, aes(x = x, y = y), inherit.aes = FALSE, 
               color = "red", fill = NA, size = 1) +
  labs(
    title = "Relationship Between Bat Speed Deviance and Run Expectancy",
    x = "Deviance from Average Bat Speed",
    y = "Run Expectancy"
  ) + # Preserve aspect ratio
  theme_minimal()

ggplot(data3, aes(x = hc_x, y = hc_y, color = bat_speed)) +
  geom_point(alpha = 0.7, size = 3) +
  scale_color_viridis_c(option = "plasma", name = "Bat Speed") + 
  labs(
    title = "Hit Coordinates Colored by Bat Speed",
    x = "Hit Coordinate X (hc_x)",
    y = "Hit Coordinate Y (hc_y)"
  ) +
  theme_minimal()
## Warning: Removed 9875 rows containing missing values or values outside the scale range
## (`geom_point()`).

#same as above, for win expectancies

#take absolute value, as win expectancies are negative for away teams

data <- data %>% mutate(delta_home_win_exp = abs(delta_home_win_exp))

filtered_data <- subset(data, delta_home_win_exp > .2)

#pinpoint location of the highest density of win expectancies over .2

density_estimate <- kde2d(
  x = filtered_data$deviance, 
  y = filtered_data$delta_home_win_exp, 
  n = 100
)

#steps to create circle to assist visualization aid 

max_density_idx <- which(density_estimate$z == max(density_estimate$z), arr.ind = TRUE)
center_x <- density_estimate$x[max_density_idx[1]]
center_y <- density_estimate$y[max_density_idx[2]]

circle_data <- create_circle(center_x, center_y, radius = 0.3)

#plots bat speed deviance against run expectancies, adding a circle to the location of the highest density of win expectancies of >.2 -- allows us to visually determine strategy

ggplot(data, aes(x = deviance, y = delta_home_win_exp, color = inning)) +
  geom_point(alpha = 0.7) +  
  geom_polygon(data = circle_data, aes(x = x, y = y), inherit.aes = FALSE, 
               color = "red", fill = NA, size = 1) + 
  labs(
    title = "Relationship Between Bat Speed Deviance and Run Expectancy",
    x = "Deviance from Average Bat Speed",
    y = "Run Expectancy"
  ) + 
  theme_minimal()

#same plots, specified for condition (runner on 3rd, less than 2 outs)

data3 <- data %>% filter(
  outs_when_up < 2 & (!is.na(on_3b))
)

filtered_data3 <- subset(data3, delta_home_win_exp > .2)

density_estimate3 <- kde2d(
  x = filtered_data3$deviance, 
  y = filtered_data3$delta_home_win_exp, 
  n = 100
)
max_density_idx3 <- which(density_estimate3$z == max(density_estimate3$z), arr.ind = TRUE)
center_x3 <- density_estimate3$x[max_density_idx3[1]]
center_y3 <- density_estimate3$y[max_density_idx3[2]]

circle_data3 <- create_circle(center_x3, center_y3, radius = 0.3)

ggplot(data3, aes(x = deviance, y = delta_home_win_exp, color = inning)) +
  geom_point(alpha = 0.7) +  
  geom_polygon(data = circle_data3, aes(x = x, y = y), inherit.aes = FALSE, 
               color = "red", fill = NA, size = 1) +
  labs(
    title = "Relationship Between Bat Speed Deviance and Win Expectancy",
    x = "Deviance from Average Bat Speed",
    y = "Run Expectancy"
  ) + 
  theme_minimal()

#creating a new variable in our dataset for specified condition named *condition*: binary indicator variable indicating 2 strikes and runners on bases.

data <- baseball %>%
  mutate(condition = ifelse(strikes == 2 & (!is.na(on_1b) | !is.na(on_2b) | !is.na(on_3b)), 1, 0))



data <- data %>% 
  filter(bat_speed > 40) %>%
  filter(bat_speed != "NA")



data <- data %>%
  group_by(player_name) %>%
  mutate(avg_bat_speed = mean(bat_speed, na.rm = TRUE),
         deviance = ifelse(!is.na(bat_speed), bat_speed - avg_bat_speed, NA),
  sd_bat_speed = sd(bat_speed, na.rm = TRUE),
  three_sd_above = avg_bat_speed + 3 * sd_bat_speed,
 three_sd_below = avg_bat_speed - 3 * sd_bat_speed
  )  %>%
  ungroup()


data <- data %>% filter(bat_speed < three_sd_above) %>% filter(bat_speed > three_sd_below)



bat_speed_model <- lmer(
  bat_speed ~ condition + (1 | player_name), 
  data = data 
)



swing_length_model <- lmer(
  swing_length ~ condition + (1 | player_name), 
  data = data 
)


summary(bat_speed_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bat_speed ~ condition + (1 | player_name)
##    Data: data
## 
## REML criterion at convergence: 1848168
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.4251 -0.5534  0.1026  0.6558  3.6525 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  player_name (Intercept)  7.503   2.739   
##  Residual                22.218   4.714   
## Number of obs: 310705, groups:  player_name, 647
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 70.54120    0.10930  645.39
## condition   -1.60224    0.02323  -68.96
## 
## Correlation of Fixed Effects:
##           (Intr)
## condition -0.034
summary(swing_length_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: swing_length ~ condition + (1 | player_name)
##    Data: data
## 
## REML criterion at convergence: 726681.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.7868 -0.6850  0.0311  0.7253  6.2946 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  player_name (Intercept) 0.1583   0.3979  
##  Residual                0.6016   0.7756  
## Number of obs: 310705, groups:  player_name, 647
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  7.257893   0.015933   455.5
## condition   -0.100167   0.003823   -26.2
## 
## Correlation of Fixed Effects:
##           (Intr)
## condition -0.038
create_circle <- function(center_x, center_y, radius, n_points = 100) {
  theta <- seq(0, 2 * pi, length.out = n_points)
  data.frame(
    x = center_x + radius * cos(theta),
    y = center_y + radius * sin(theta)
  )
}

library(ggplot2)
library(MASS)


filtered_data <- subset(data, delta_run_exp > 2.5)


density_estimate <- kde2d(
  x = filtered_data$deviance, 
  y = filtered_data$delta_run_exp, 
  n = 100
)


max_density_idx <- which(density_estimate$z == max(density_estimate$z), arr.ind = TRUE)
center_x <- density_estimate$x[max_density_idx[1]]
center_y <- density_estimate$y[max_density_idx[2]]

circle_data <- create_circle(center_x, center_y, radius = 0.5)


ggplot(data, aes(x = deviance, y = delta_run_exp, color = inning)) +
  geom_point(alpha = 0.7) +  
  geom_polygon(data = circle_data, aes(x = x, y = y), inherit.aes = FALSE, 
               color = "red", fill = NA, size = 1) + 
  labs(
    title = "Relationship Between Bat Speed Deviance and Run Expectancy",
    x = "Deviance from Average Bat Speed",
    y = "Run Expectancy"
  ) + # Preserve aspect ratio
  theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(data, aes(x = hc_x, y = hc_y, color = bat_speed)) +
  geom_point(alpha = 0.7, size = 3) +
  scale_color_viridis_c(option = "plasma", name = "Bat Speed") + 
  labs(
    title = "Hit Coordinates Colored by Bat Speed",
    x = "Hit Coordinate X (hc_x)",
    y = "Hit Coordinate Y (hc_y)"
  ) +
  theme_minimal()
## Warning: Removed 196266 rows containing missing values or values outside the scale range
## (`geom_point()`).

data3 <- data %>% filter(
  outs_when_up < 2 & (!is.na(on_3b))
)

filtered_data3 <- subset(data3, delta_run_exp > 2.5)

density_estimate3 <- kde2d(
  x = filtered_data3$deviance, 
  y = filtered_data3$delta_run_exp, 
  n = 100
)
max_density_idx3 <- which(density_estimate3$z == max(density_estimate3$z), arr.ind = TRUE)
center_x3 <- density_estimate3$x[max_density_idx3[1]]
center_y3 <- density_estimate3$y[max_density_idx3[2]]

circle_data3 <- create_circle(center_x3, center_y3, radius = 0.5)

ggplot(data3, aes(x = deviance, y = delta_run_exp, color = inning)) +
  geom_point(alpha = 0.7) +  
  geom_polygon(data = circle_data3, aes(x = x, y = y), inherit.aes = FALSE, 
               color = "red", fill = NA, size = 1) +
  labs(
    title = "Relationship Between Bat Speed Deviance and Run Expectancy",
    x = "Deviance from Average Bat Speed",
    y = "Run Expectancy"
  ) + # Preserve aspect ratio
  theme_minimal()

ggplot(data3, aes(x = hc_x, y = hc_y, color = bat_speed)) +
  geom_point(alpha = 0.7, size = 3) +
  scale_color_viridis_c(option = "plasma", name = "Bat Speed") + 
  labs(
    title = "Hit Coordinates Colored by Bat Speed",
    x = "Hit Coordinate X (hc_x)",
    y = "Hit Coordinate Y (hc_y)"
  ) +
  theme_minimal()
## Warning: Removed 9875 rows containing missing values or values outside the scale range
## (`geom_point()`).

data <- data %>% mutate(delta_home_win_exp = abs(delta_home_win_exp))

filtered_data <- subset(data, delta_home_win_exp > .2)


density_estimate <- kde2d(
  x = filtered_data$deviance, 
  y = filtered_data$delta_home_win_exp, 
  n = 100
)


max_density_idx <- which(density_estimate$z == max(density_estimate$z), arr.ind = TRUE)
center_x <- density_estimate$x[max_density_idx[1]]
center_y <- density_estimate$y[max_density_idx[2]]

circle_data <- create_circle(center_x, center_y, radius = 0.3)


ggplot(data, aes(x = deviance, y = delta_home_win_exp, color = inning)) +
  geom_point(alpha = 0.7) +  
  geom_polygon(data = circle_data, aes(x = x, y = y), inherit.aes = FALSE, 
               color = "red", fill = NA, size = 1) + 
  labs(
    title = "Relationship Between Bat Speed Deviance and Run Expectancy",
    x = "Deviance from Average Bat Speed",
    y = "Run Expectancy"
  ) + 
  theme_minimal()

data3 <- data %>% filter(
  outs_when_up < 2 & (!is.na(on_3b))
)

filtered_data3 <- subset(data3, delta_home_win_exp > .2)

density_estimate3 <- kde2d(
  x = filtered_data3$deviance, 
  y = filtered_data3$delta_home_win_exp, 
  n = 100
)
max_density_idx3 <- which(density_estimate3$z == max(density_estimate3$z), arr.ind = TRUE)
center_x3 <- density_estimate3$x[max_density_idx3[1]]
center_y3 <- density_estimate3$y[max_density_idx3[2]]

circle_data3 <- create_circle(center_x3, center_y3, radius = 0.3)

ggplot(data3, aes(x = deviance, y = delta_home_win_exp, color = inning)) +
  geom_point(alpha = 0.7) +  
  geom_polygon(data = circle_data3, aes(x = x, y = y), inherit.aes = FALSE, 
               color = "red", fill = NA, size = 1) +
  labs(
    title = "Relationship Between Bat Speed Deviance and Win Expectancy",
    x = "Deviance from Average Bat Speed",
    y = "Run Expectancy"
  ) + 
  theme_minimal()